#!/usr/bin/env python
# The function unknown() takes a URL as its argument, and returns a list of
# unknown words that occur on that web page. This is my answer to exercise 3.21
# in the book Natural Language Processing with Python.
#
# Guan Gui (ggui, 312348)
# $Rev: 17 $
# $LastChangedDate: 2010-03-09 14:36:25 +0000 (Tue, 09 Mar 2010) $

from urllib import urlopen
import nltk
import re
import sys

def unknown(url):
    """Print and return the words on a web page unknown to the NLTK word list.

    The page at ``url`` is fetched, its markup is stripped with
    ``nltk.clean_html``, and the text is split into distinct runs of word
    characters.  A token counts as unknown when its lowercased form is not
    present in ``nltk.corpus.words``.

    If the words corpus is not installed, a message is printed and
    ``nltk.download()`` is invoked before the corpus is loaded again.

    Args:
        url: Address of the page to inspect.

    Returns:
        A list of the distinct unknown tokens (in no particular order), or
        ``None`` if the page could not be fetched.  The list is also printed
        to standard output.
    """
    # NOTE: single-argument print(...) is used throughout so the output is
    # identical under the Python 2 print statement this script was written
    # for, while the block also parses under Python 3.
    try:
        response = urlopen(url)
        try:
            html = response.read()
        finally:
            # Release the connection even if the read fails part-way.
            response.close()
    except IOError:
        print("Error: can\'t reach the given url")
        return None

    # NOTE(review): relies on an NLTK release that still provides
    # clean_html -- confirm against the installed version.
    raw = nltk.clean_html(html)

    try:
        words = nltk.corpus.words.words()
    except LookupError:
        print("Cannot find corpora/words. Please download one.")
        nltk.download()
        words = nltk.corpus.words.words()

    # Build the vocabulary set once: testing membership directly against the
    # corpus word list would rescan the whole list for every token.
    vocabulary = set(words)
    tokens = set(re.findall(r'\w+', raw))
    unknown_words = [w for w in tokens if w.lower() not in vocabulary]
    print(unknown_words)
    return unknown_words

if __name__ == "__main__":
    # Take the URL from the command line when exactly one argument was
    # supplied; with any other argument count, prompt for it on stdin.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        sys.stdout.write('Please specify an URL: ')
        url = sys.stdin.readline().rstrip('\n')
    else:
        url = cli_args[0]
    unknown(url)
